{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import re\n",
    "from pprint import pprint\n",
    "sampleId_unique_overlap = pd.read_excel(\"library.20200424.xlsx\", sheet_name=\"临床信息表(clinicalDf_overlap_libraryDf)\").loc[:,\"样本编号\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "filenames = []\n",
    "df_list = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "shape of test_df: (127, 9); shape of test_df_filt: (96, 9); \n"
    }
   ],
   "source": [
    "# 处理\"BGI健康人 20190905.xlsx\"\n",
    "test_df = pd.read_excel(\"BGI健康人 20190905.xlsx\", sheet_name=\"2 样本提取表\")\n",
    "test_df_filt = test_df.loc[(test_df[\"样本编号\"].isin(sampleId_unique_overlap)) & (test_df[\"样本类型\"]==\"血浆\")]\n",
    "print(\"shape of test_df: {}; shape of test_df_filt: {}; \".format(test_df.shape, test_df_filt.shape))\n",
    "filenames.append(\"BGI健康人 20190905.xlsx\")\n",
    "df_list.append(test_df_filt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "shape of test_df: (129, 71); shape of test_df_filt: (43, 71); \n"
    }
   ],
   "source": [
    "# 处理\"肺癌.xlsx\"\n",
    "test_df = pd.read_excel(\"肺癌.xlsx\", sheet_name=0)\n",
    "test_df_filt = test_df.loc[test_df[\"华大编号\"].isin(sampleId_unique_overlap)]\n",
    "print(\"shape of test_df: {}; shape of test_df_filt: {}; \".format(test_df.shape, test_df_filt.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame({\n",
    "    \"DNA提取编号\": test_df_filt.loc[:,\"华大编号\"].str.replace(re.compile(r'$'), \"_1\"),\n",
    "    \"样本编号\": test_df_filt.loc[:,\"华大编号\"],\n",
    "    \"提取日期\": [np.nan for x in test_df_filt.index],\n",
    "    \"样本类型\": [\"血浆\" for x in test_df_filt.index],\n",
    "    \"样本体积\": [np.nan for x in test_df_filt.index],\n",
    "    \"提取方法\": [np.nan for x in test_df_filt.index],\n",
    "    \"浓度\": [0 for x in test_df_filt.index],\n",
    "    \"体积\": [0 for x in test_df_filt.index],\n",
    "    \"备注\": [np.nan for x in test_df_filt.index]\n",
    "    })\n",
    "\n",
    "filenames.append(\"肺癌.xlsx\")\n",
    "df_list.append(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "shape of test_df: (24, 28); shape of test_df_filt: (0, 28); \n"
    }
   ],
   "source": [
    "# 处理\"福建肠癌.xlsx\"\n",
    "test_df = pd.read_excel(\"福建肠癌.xlsx\", sheet_name=0)\n",
    "test_df_filt = test_df.loc[test_df[\"华大编号\"].isin(sampleId_unique_overlap)]\n",
    "print(\"shape of test_df: {}; shape of test_df_filt: {}; \".format(test_df.shape, test_df_filt.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "shape of test_df: (182, 27); shape of test_df_filt: (66, 27); \n"
    }
   ],
   "source": [
    "# 处理\"广州南医五院第二批样本 20191119.xlsx\"\n",
    "test_df = pd.read_excel(\"广州南医五院第二批样本 20191119.xlsx\", sheet_name=0)\n",
    "test_df_filt = test_df.loc[test_df[\"华大编号\"].isin(sampleId_unique_overlap)]\n",
    "print(\"shape of test_df: {}; shape of test_df_filt: {}; \".format(test_df.shape, test_df_filt.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame({\n",
    "    \"DNA提取编号\": test_df_filt.loc[:,\"华大编号\"].str.replace(re.compile(r'$'), \"_1\"),\n",
    "    \"样本编号\": test_df_filt.loc[:,\"华大编号\"],\n",
    "    \"提取日期\": [np.nan for x in test_df_filt.index],\n",
    "    \"样本类型\": [\"血浆\" for x in test_df_filt.index],\n",
    "    \"样本体积\": test_df_filt.loc[:,\"血浆体积（ml）\"],\n",
    "    \"提取方法\": [np.nan for x in test_df_filt.index],\n",
    "    \"浓度\": test_df_filt.loc[:,\"cfDNA浓度\"],\n",
    "    \"体积\": test_df_filt.loc[:,\"cfDNA体积\"],\n",
    "    \"备注\": [np.nan for x in test_df_filt.index]\n",
    "    })\n",
    "\n",
    "filenames.append(\"广州南医五院第二批样本 20191119.xlsx\")\n",
    "df_list.append(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "shape of test_df: (105, 9); shape of test_df_filt: (22, 9); \n"
    }
   ],
   "source": [
    "# 处理\"广州南医五院第一批样本 20190905.xlsx\"\n",
    "test_df = pd.read_excel(\"广州南医五院第一批样本 20190905.xlsx\", sheet_name=\"2 样本提取表\")\n",
    "test_df_filt = test_df.loc[(test_df[\"样本编号\"].isin(sampleId_unique_overlap)) & (test_df[\"样本类型\"]==\"血浆\")]\n",
    "print(\"shape of test_df: {}; shape of test_df_filt: {}; \".format(test_df.shape, test_df_filt.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "filenames.append(\"广州南医五院第一批样本 20190905.xlsx\")\n",
    "df_list.append(test_df_filt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "shape of test_df: (97, 9); shape of test_df_filt: (53, 9); \n"
    }
   ],
   "source": [
    "# 处理\"湖南省肿瘤医院结直肠癌 20191107.xlsx\"\n",
    "test_df = pd.read_excel(\"湖南省肿瘤医院结直肠癌 20191107.xlsx\", sheet_name=\"2 样本提取表\")\n",
    "test_df_filt = test_df.loc[(test_df[\"样本编号\"].isin(sampleId_unique_overlap)) & (test_df[\"样本类型\"]==\"血浆\")]\n",
    "print(\"shape of test_df: {}; shape of test_df_filt: {}; \".format(test_df.shape, test_df_filt.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "filenames.append(\"湖南省肿瘤医院结直肠癌 20191107.xlsx\")\n",
    "df_list.append(test_df_filt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "shape of test_df1: (128, 35); shape of test_df1_filt: (0, 35); \nshape of test_df2: (15, 18); shape of test_df2_filt: (0, 18); \n"
    }
   ],
   "source": [
    "# 处理\"华大肠癌项目-样本信息登记表.xlsx\"\n",
    "test_df1 = pd.read_excel(\"华大肠癌项目-样本信息登记表.xlsx\", sheet_name=\"肠癌\")\n",
    "test_df1_filt = test_df1.loc[test_df1[\"华大编号\"].isin(sampleId_unique_overlap)]\n",
    "print(\"shape of test_df1: {}; shape of test_df1_filt: {}; \".format(test_df1.shape, test_df1_filt.shape))\n",
    "test_df2 = pd.read_excel(\"华大肠癌项目-样本信息登记表.xlsx\", sheet_name=\"健康人\")\n",
    "test_df2_filt = test_df2.loc[test_df2[\"全血编号\"].isin(sampleId_unique_overlap)]\n",
    "print(\"shape of test_df2: {}; shape of test_df2_filt: {}; \".format(test_df2.shape, test_df2_filt.shape))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "shape of test_df: (9, 28); shape of test_df_filt: (0, 28); \n"
    }
   ],
   "source": [
    "# 处理\"南通肝癌.xlsx\"\n",
    "test_df = pd.read_excel(\"南通肝癌.xlsx\", sheet_name=0)\n",
    "test_df_filt = test_df.loc[test_df[\"华大编号\"].isin(sampleId_unique_overlap)]\n",
    "print(\"shape of test_df: {}; shape of test_df_filt: {}; \".format(test_df.shape, test_df_filt.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "shape of test_df: (235, 26); shape of test_df_filt: (62, 26); \n"
    }
   ],
   "source": [
    "# 处理\"上海肺癌.xlsx\"\n",
    "test_df = pd.read_excel(\"上海肺癌.xlsx\", sheet_name=0)\n",
    "test_df_filt = test_df.loc[test_df[\"华大编号\"].isin(sampleId_unique_overlap)]\n",
    "print(\"shape of test_df: {}; shape of test_df_filt: {}; \".format(test_df.shape, test_df_filt.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame({\n",
    "    \"DNA提取编号\": test_df_filt.loc[:,\"华大编号\"].str.replace(re.compile(r'$'), \"_1\"),\n",
    "    \"样本编号\": test_df_filt.loc[:,\"华大编号\"],\n",
    "    \"提取日期\": [np.nan for x in test_df_filt.index],\n",
    "    \"样本类型\": [\"血浆\" for x in test_df_filt.index],\n",
    "    \"样本体积\": test_df_filt.loc[:,\"血浆体积（ml）\"],\n",
    "    \"提取方法\": [np.nan for x in test_df_filt.index],\n",
    "    \"浓度\": test_df_filt.loc[:,\"cfDNA浓度\"],\n",
    "    \"体积\": test_df_filt.loc[:,\"cfDNA体积\"],\n",
    "    \"备注\": [np.nan for x in test_df_filt.index]\n",
    "    })\n",
    "\n",
    "filenames.append(\"上海肺癌.xlsx\")\n",
    "df_list.append(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "shape of test_df: (172, 9); shape of test_df_filt: (117, 9); \n"
    }
   ],
   "source": [
    "# 处理\"上海复旦中山胰腺癌 20191107.xlsx\"\n",
    "test_df = pd.read_excel(\"上海复旦中山胰腺癌 20191107.xlsx\", sheet_name=\"2 样本提取表\")\n",
    "test_df_filt = test_df.loc[(test_df[\"样本编号\"].isin(sampleId_unique_overlap)) & (test_df[\"样本类型\"]==\"血浆\")]\n",
    "print(\"shape of test_df: {}; shape of test_df_filt: {}; \".format(test_df.shape, test_df_filt.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "filenames.append(\"上海复旦中山胰腺癌 20191107.xlsx\")\n",
    "df_list.append(test_df_filt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "shape of test_df: (426, 28); shape of test_df_filt: (212, 28); \n"
    }
   ],
   "source": [
    "# 处理\"上海肝癌.xlsx\"\n",
    "test_df = pd.read_excel(\"上海肝癌.xlsx\", sheet_name=0)\n",
    "test_df_filt = test_df.loc[test_df[\"华大编号\"].isin(sampleId_unique_overlap)]\n",
    "print(\"shape of test_df: {}; shape of test_df_filt: {}; \".format(test_df.shape, test_df_filt.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame({\n",
    "    \"DNA提取编号\": test_df_filt.loc[:,\"华大编号\"].str.replace(re.compile(r'$'), \"_1\"),\n",
    "    \"样本编号\": test_df_filt.loc[:,\"华大编号\"],\n",
    "    \"提取日期\": [np.nan for x in test_df_filt.index],\n",
    "    \"样本类型\": [\"血浆\" for x in test_df_filt.index],\n",
    "    \"样本体积\": test_df_filt.loc[:,\"血浆体积（ml）\"],\n",
    "    \"提取方法\": test_df_filt.loc[:,\"提取方法\"],\n",
    "    \"浓度\": test_df_filt.loc[:,\"cfDNA浓度\"],\n",
    "    \"体积\": test_df_filt.loc[:,\"cfDNA体积\"],\n",
    "    \"备注\": [np.nan for x in test_df_filt.index]\n",
    "    })\n",
    "\n",
    "filenames.append(\"上海肝癌.xlsx\")\n",
    "df_list.append(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "shape of test_df: (41, 22); shape of test_df_filt: (20, 22); \n"
    }
   ],
   "source": [
    "# 处理\"上海肝硬化.xlsx\"\n",
    "test_df = pd.read_excel(\"上海肝硬化.xlsx\", sheet_name=0)\n",
    "test_df_filt = test_df.loc[test_df[\"华大编号\"].isin(sampleId_unique_overlap)]\n",
    "print(\"shape of test_df: {}; shape of test_df_filt: {}; \".format(test_df.shape, test_df_filt.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame({\n",
    "    \"DNA提取编号\": test_df_filt.loc[:,\"华大编号\"].str.replace(re.compile(r'$'), \"_1\"),\n",
    "    \"样本编号\": test_df_filt.loc[:,\"华大编号\"],\n",
    "    \"提取日期\": [np.nan for x in test_df_filt.index],\n",
    "    \"样本类型\": [\"血浆\" for x in test_df_filt.index],\n",
    "    \"样本体积\": test_df_filt.loc[:,\"血浆体积（ml）\"],\n",
    "    \"提取方法\": test_df_filt.loc[:,\"提取方法\"],\n",
    "    \"浓度\": test_df_filt.loc[:,\"cfDNA浓度\"],\n",
    "    \"体积\": test_df_filt.loc[:,\"cfDNA体积\"],\n",
    "    \"备注\": [np.nan for x in test_df_filt.index]\n",
    "    })\n",
    "\n",
    "filenames.append(\"上海肝硬化.xlsx\")\n",
    "df_list.append(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "shape of test_df: (60, 16); shape of test_df_filt: (0, 16); \n"
    }
   ],
   "source": [
    "# 处理\"上海组织.xlsx\"\n",
    "test_df = pd.read_excel(\"上海组织.xlsx\", sheet_name=0)\n",
    "test_df_filt = test_df.loc[test_df[\"华大编号\"].isin(sampleId_unique_overlap)]\n",
    "print(\"shape of test_df: {}; shape of test_df_filt: {}; \".format(test_df.shape, test_df_filt.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "shape of test_df: (171, 14); shape of test_df_filt: (0, 14); \n"
    }
   ],
   "source": [
    "# 处理\"胃癌组织.xlsx\"\n",
    "test_df = pd.read_excel(\"胃癌组织.xlsx\", sheet_name=0)\n",
    "test_df_filt = test_df.loc[test_df[\"华大虚拟编号\"].isin(sampleId_unique_overlap)]\n",
    "print(\"shape of test_df: {}; shape of test_df_filt: {}; \".format(test_df.shape, test_df_filt.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "shape of test_df: (41, 30); shape of test_df_filt: (17, 30); \n"
    }
   ],
   "source": [
    "# 处理\"中大六院肠癌早筛-样本信息登记表(回顾性两批)L.xlsx\"\n",
    "test_df = pd.read_excel(\"中大六院肠癌早筛-样本信息登记表(回顾性两批)L.xlsx\", sheet_name=0)\n",
    "test_df_filt = test_df.loc[test_df[\"华大编号\"].isin(sampleId_unique_overlap)]\n",
    "print(\"shape of test_df: {}; shape of test_df_filt: {}; \".format(test_df.shape, test_df_filt.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame({\n",
    "    \"DNA提取编号\": test_df_filt.loc[:,\"华大编号\"].str.replace(re.compile(r'$'), \"_1\"),\n",
    "    \"样本编号\": test_df_filt.loc[:,\"华大编号\"],\n",
    "    \"提取日期\": [np.nan for x in test_df_filt.index],\n",
    "    \"样本类型\": [\"血浆\" for x in test_df_filt.index],\n",
    "    \"样本体积\": test_df_filt.loc[:,\"血浆体积（ml）\"],\n",
    "    \"提取方法\": test_df_filt.loc[:,\"提取方法\"],\n",
    "    \"浓度\": test_df_filt.loc[:,\"cfDNA浓度\"],\n",
    "    \"体积\": test_df_filt.loc[:,\"cfDNA体积\"],\n",
    "    \"备注\": [np.nan for x in test_df_filt.index]\n",
    "    })\n",
    "\n",
    "filenames.append(\"中大六院肠癌早筛-样本信息登记表(回顾性两批)L.xlsx\")\n",
    "df_list.append(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "shape of test_df: (254, 28); shape of test_df_filt: (190, 28); \n"
    }
   ],
   "source": [
    "# 处理\"中大六院肠癌早筛-样本信息登记表2020-3-31.xlsx\"\n",
    "test_df = pd.read_excel(\"中大六院肠癌早筛-样本信息登记表2020-3-31.xlsx\", sheet_name=0)\n",
    "test_df_filt = test_df.loc[test_df[\"华大编号\"].isin(sampleId_unique_overlap)]\n",
    "print(\"shape of test_df: {}; shape of test_df_filt: {}; \".format(test_df.shape, test_df_filt.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "data = pd.DataFrame({\n",
    "    \"DNA提取编号\": test_df_filt.loc[:,\"华大编号\"].str.replace(re.compile(r'$'), \"_1\"),\n",
    "    \"样本编号\": test_df_filt.loc[:,\"华大编号\"],\n",
    "    \"提取日期\": [np.nan for x in test_df_filt.index],\n",
    "    \"样本类型\": [\"血浆\" for x in test_df_filt.index],\n",
    "    \"样本体积\": test_df_filt.loc[:,\"血浆体积（ml）\"],\n",
    "    \"提取方法\": test_df_filt.loc[:,\"提取方法\"],\n",
    "    \"浓度\": test_df_filt.loc[:,\"cfDNA浓度\"],\n",
    "    \"体积\": test_df_filt.loc[:,\"cfDNA体积\"],\n",
    "    \"备注\": [np.nan for x in test_df_filt.index]\n",
    "    })\n",
    "\n",
    "filenames.append(\"中大六院肠癌早筛-样本信息登记表2020-3-31.xlsx\")\n",
    "df_list.append(data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "shape of test_df1: (128, 34); shape of test_df1_filt: (0, 34); \nshape of test_df2: (15, 6); shape of test_df2_filt: (0, 6); \n"
    }
   ],
   "source": [
    "# 处理\"中山大学肿瘤医院-肠癌.xlsx\"\n",
    "test_df1 = pd.read_excel(\"中山大学肿瘤医院-肠癌.xlsx\", sheet_name=\"肠癌\")\n",
    "test_df1_filt = test_df1.loc[test_df1[\"华大编号\"].isin(sampleId_unique_overlap)]\n",
    "print(\"shape of test_df1: {}; shape of test_df1_filt: {}; \".format(test_df1.shape, test_df1_filt.shape))\n",
    "test_df2 = pd.read_excel(\"中山大学肿瘤医院-肠癌.xlsx\", sheet_name=\"健康人\")\n",
    "test_df2_filt = test_df2.loc[test_df2[\"全血编号\"].isin(sampleId_unique_overlap)]\n",
    "print(\"shape of test_df2: {}; shape of test_df2_filt: {}; \".format(test_df2.shape, test_df2_filt.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "shape of test_df: (112, 9); shape of test_df_filt: (112, 9); \n"
    }
   ],
   "source": [
    "# 处理\"中山肝癌MRD-20191113.xlsx\"\n",
    "test_df = pd.read_excel(\"中山肝癌MRD-20191113.xlsx\", sheet_name=\"2 样本提取表\")\n",
    "test_df_filt = test_df.loc[(test_df[\"样本编号\"].isin(sampleId_unique_overlap)) & (test_df[\"样本类型\"]==\"血浆\")]\n",
    "print(\"shape of test_df: {}; shape of test_df_filt: {}; \".format(test_df.shape, test_df_filt.shape))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "filenames.append(\"中山肝癌MRD-20191113.xlsx\")\n",
    "df_list.append(test_df_filt)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": "len of filenames: 12; len of df_list: 12\n"
    }
   ],
   "source": [
    "print(\"len of filenames: {}; len of df_list: {}\".format(len(filenames), len(df_list)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "with pd.ExcelWriter(\"extract.20200426.xlsx\") as writer:\n",
    "    for idx in range(len(filenames)):\n",
    "        df_list[idx].to_excel(writer, sheet_name=filenames[idx], index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "output_type": "execute_result",
     "data": {
      "text/plain": "<bound method NDFrame.head of          DNA提取编号        样本编号       提取日期 样本类型  样本体积  提取方法     浓度  体积   备注\n0   19P3758198_1  19P3758198 2019-04-24   血浆   9.0   NaN  0.608  73  NaN\n1   19P3758199_1  19P3758199 2019-04-24   血浆   5.0   NaN  0.928  42  NaN\n2   19P3758200_1  19P3758200 2019-04-24   血浆   5.5   NaN  0.772  42  NaN\n3   19P3758201_1  19P3758201 2019-04-24   血浆   9.5   NaN  0.540  73  NaN\n4   19P3758202_1  19P3758202 2019-04-24   血浆   9.1   NaN  0.470  73  NaN\n..           ...         ...        ...  ...   ...   ...    ...  ..  ...\n91  19P3758293_1  19P3758293 2019-04-25   血浆   4.9   NaN  0.574  42  NaN\n92  19P3758294_1  19P3758294 2019-04-25   血浆   5.5   NaN  0.514  42  NaN\n93  19P3758295_1  19P3758295 2019-04-25   血浆   8.4   NaN  0.306  62  NaN\n94  19P3758296_1  19P3758296 2019-04-25   血浆   8.0   NaN  0.548  42  NaN\n95  19P3758297_1  19P3758297 2019-04-25   血浆   8.0   NaN  0.788  42  NaN\n\n[96 rows x 9 columns]>"
     },
     "metadata": {},
     "execution_count": 39
    }
   ],
   "source": [
    "df_list[0].head"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "sample_id_df_list = {}\n",
    "for idx in range(len(filenames)):\n",
    "    df = df_list[idx]\n",
    "    for idx2 in range(df.shape[0]):\n",
    "        row = df.index[idx2]\n",
    "        if df.loc[row, \"样本编号\"] in sample_id_df_list:\n",
    "            sample_id_df_list[df.loc[row, \"样本编号\"]].append(filenames[idx])\n",
    "        else:\n",
    "            sample_id_df_list[df.loc[row, \"样本编号\"]] = [filenames[idx]]\n",
    "\n",
    "for k in sample_id_df_list:\n",
    "    if len(sample_id_df_list[k])>1:\n",
    "        print(\"key: {}; value: {}\".format(k, sample_id_df_list[k]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sample_id_df_list"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "python36764bit590e0209625c41dab6b137cee9f052d1",
   "display_name": "Python 3.6.7 64-bit"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}